The full analysis can be found in the following series of Python files:
https://github.com/ingonader/python-for-data-science-edx/tree/master/week-09-and-10-final-project
kgl-cycle-share-main-file.py
kgl-cycle-share-01-setup.py
kgl-cycle-share-02-data-download.py
kgl-cycle-share-03-data-load.py
kgl-cycle-share-04-data-prep.py
kgl-cycle-share-05-exploratory-analysis.py
kgl-cycle-share-06a-random-forest.py
kgl-cycle-share-06b-gradient-boosting-weather-only.py
kgl-cycle-share-06b-gradient-boosting-with-dewpoint.py
kgl-cycle-share-06b-gradient-boosting-with-imputation.py
kgl-cycle-share-06b-gradient-boosting.py
kgl-cycle-share-06c-xgboost.py
kgl-cycle-share-06d-pt01-classification-gb.py
kgl-cycle-share-06d-pt02-regression-gb.py
kgl-cycle-share-06d-pt03-combination.py
kgl-cycle-share-07-eval-model.py
## ========================================================================= ##
## import libraries
## ========================================================================= ##
import requests
import io
import zipfile
import os
import urllib.parse
import re ## for regular expressions
from itertools import chain ## for chain, similar to R's unlist (flatten lists)
import collections ## for Counters (used in frequency tables, for example)
import numpy as np
import csv
import pandas as pd
from pandas.api.types import CategoricalDtype ## for sorted plotnine/ggplot categories
from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns ## for correlation heatmap
#from mpl_toolkits.basemap import Basemap
import folium
from scipy import stats
import patsy ## for design matrices like R
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
import pathvalidate as pv
## ========================================================================= ##
## global variables and options
## ========================================================================= ##
path_dat = './data'
path_out = './output'
pd.set_option('display.max_columns', 50)
## weather data: station_id, year_list, month_list for data download and data prep:
# station_id = 51157 # MONTREAL INTL A; QUEBEC
station_id = 10761 # "MCTAVISH", "QUEBEC"
year_list = [2014, 2015, 2016, 2017]
month_list = list(range(1, 13))
## cycle trip data:
## data for these years:
dat_years = [2014, 2015, 2016, 2017]
## "translation" from "quoted" to long variable / feature names:
varnames_long_dict = {
"Q('Month')" : "Month (1-12)",
"Q('Temp (°C)')" : "Temperature (°C)",
"Q('Rel Hum (%)')" : "Relative Humidity (%)",
"Q('Wind Dir (10s deg)')" : "Wind Direction (deg)",
"Q('Wind Spd (km/h)')" : "Wind Speed (km/h)",
"Q('Stn Press (kPa)')" : "Atmospheric Pressure (kPa)",
"Q('hr_of_day')" : "Hour of the Day (0-23)",
"Q('day_of_week')" : "Day of the Week (0-6)"
}
## "translation" of unquoted to long variable / feature names:
varnames_orig_long_dict = {
'trip_cnt' : 'Trip Count',
'Year' : 'Year (2014-2017)',
'Month' : 'Month (1-12)',
'Temp (°C)' : 'Temperature (°C)',
'Dew Point Temp (°C)' : 'Dew Point (°C)',
'Rel Hum (%)' : 'Relative Humidity (%)',
'Wind Dir (10s deg)' : 'Wind Direction (10s deg)',
'Wind Spd (km/h)' : 'Wind Speed (km/h)',
'Stn Press (kPa)' : 'Atmospheric Pressure (kPa)',
'hr_of_day' : 'Hour of the Day (0-23)',
'day_of_week' : 'Day of the Week (0-6)'
}
## ========================================================================= ##
## download data
## ========================================================================= ##
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## montreal bike share data
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## BIXI Montreal (public bicycle sharing system)
## Data on North America's first large-scale bike sharing system
## https://www.kaggle.com/aubertsigouin/biximtl/home
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## weather data from canadian government
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## station id available from:
## ftp://ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees/
## ftp://ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees/Station%20Inventory%20EN.csv
## define station, years and month:
## (for downloading and data prep later)
## see setup.py file
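## a minimal sketch of how the station id could be looked up in the station
## inventory file linked above (assuming it has been downloaded into path_dat;
## the number of preamble rows to skip may differ between file versions, and the
## column name 'Name' is an assumption):
# dat_station_inv = pd.read_csv(os.path.join(path_dat, 'Station Inventory EN.csv'), skiprows = 3)
# dat_station_inv[dat_station_inv['Name'].str.contains('MCTAVISH', na = False)]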
for year in year_list:
for month in month_list:
## define url for bulk data download of http://climate.weather.gc.ca/climate_data :
url_string = 'http://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID={0}&Year={1}&Month={2}&Day=14&timeframe=1&submit=Download+Data'.format(station_id, year, month)
## define filename, as well as target path:
filename_target = 'weather_montreal_{0}_{1:04d}_{2:02d}.csv'.format(station_id, year, month)
path_dat = './data'
## check if file exists, only download if it does not exist:
if os.path.isfile(os.path.join(path_dat, filename_target)):
print('year {0:04d}, month {1:02d} -- file exists, skipped.'.format(year, month))
else:
## download the file (only done if it does not exist yet):
r = requests.get(url_string, allow_redirects=True)
open(os.path.join(path_dat, filename_target), 'wb').write(r.content)
## display progress:
print('year {0:04d}, month {1:02d} -- downloaded.'.format(year, month))
print('data download complete.')
## ========================================================================= ##
## Load data files
## ========================================================================= ##
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## montreal bike share data
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## inspect trip data:
tmp = pd.read_csv('./data/OD_2014.csv').head()
print(tmp.dtypes)
tmp.head()
## inspect stations data:
print(pd.read_csv('./data/Stations_2014.csv').head().dtypes)
print(pd.read_csv('./data/Stations_2017.csv').head().dtypes)
## inspect stations data:
pd.read_csv('./data/Stations_2014.csv').head()
pd.read_csv('./data/Stations_2015.csv').head()
pd.read_csv('./data/Stations_2016.csv').head()
pd.read_csv('./data/Stations_2017.csv').head()
# ## data for these years:
# dat_years = [2014, 2015, 2016, 2017]
# ## see setup.py
## function to read each file in the same way:
def read_trip_csv(path_dat, year, filename_prefix = 'OD_', filename_suffix = '.csv'):
dtype = {'id': 'int64',
'start_date' : 'str',
'start_station_code' : 'str',
'end_date' : 'str',
'end_station_code' : 'str',
'duration_sec' : 'int64',
'is_member' : 'int64'}
names = list(dtype.keys())
parse_dates = ['start_date', 'end_date']
filename = filename_prefix + '{0}'.format(year) + filename_suffix
dat = pd.read_csv(os.path.join(path_dat, filename), names = names, skiprows = 1,
dtype = dtype, parse_dates = parse_dates)
return dat
#read_trip_csv(path_dat, 2014)
## function to read each Stations_ file in the same way:
def read_stations_csv(path_dat, year, filename_prefix = 'Stations_', filename_suffix = '.csv'):
dtype = {'code' : 'str',
'name' : 'str',
'latitude' : 'float',
'longitude' : 'float'}
## new column in 2017:
if (year == 2017):
dtype['is_public'] = 'int'
names = list(dtype.keys())
filename = filename_prefix + '{0}'.format(year) + filename_suffix
dat = pd.read_csv(os.path.join(path_dat, filename), names = names, skiprows = 1,
dtype = dtype)
## add is_public column if not available in data:
if (year < 2017):
dat['is_public'] = None
dat['year'] = year
return dat
#read_stations_csv(path_dat, 2014)
#read_stations_csv(path_dat, 2017)
## read all files into a list for trips and stations:
dat_trip_raw_list = [read_trip_csv(path_dat, i) for i in dat_years]
dat_stations_raw_list = [read_stations_csv(path_dat, i) for i in dat_years]
## make one pandas dataframe from lists (for trips and stations):
dat_trip_raw = pd.concat(dat_trip_raw_list, axis = 0)
dat_stations_raw = pd.concat(dat_stations_raw_list, axis = 0)
## basic inspection of raw trip data:
dat_trip_raw.info()
dat_trip_raw.shape # (14598961, 7)
type(dat_trip_raw) # pandas.core.frame.DataFrame
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## weather data from canadian government
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## inspect weather data and its column types:
tmp = pd.read_csv(os.path.join(path_dat, 'weather_montreal_10761_2014_01.csv'), skiprows = 15).head()
tmp = pd.read_csv(os.path.join(path_dat, 'weather_montreal_10761_2015_01.csv'), skiprows = 15).head()
print(tmp.dtypes)
tmp.head()
# year = 2017
# month = 12
# filename_prefix = 'weather_montreal_'
# filename_suffix = '.csv'
## function to read weather data csv
def read_weather_csv(path_dat, year, month, station_id,
cols = ['Date/Time', 'Year', 'Month', 'Temp (°C)',
'Dew Point Temp (°C)', 'Rel Hum (%)', 'Wind Dir (10s deg)',
'Wind Spd (km/h)', 'Stn Press (kPa)', 'Wind Chill'],
filename_prefix = 'weather_montreal_',
filename_suffix = '.csv'):
dtype = {'Date/Time' : 'str',
'Year' : 'int64',
'Month' : 'int64',
'Day' : 'int64',
'Time' : 'str',
'Temp (°C)' : 'float64',
'Temp Flag' : 'str',
'Dew Point Temp (°C)' : 'float64',
'Dew Point Temp Flag' : 'str',
'Rel Hum (%)' : 'float64',
'Rel Hum Flag' : 'str',
'Wind Dir (10s deg)' : 'float64',
'Wind Dir Flag' : 'str',
'Wind Spd (km/h)' : 'float64',
'Wind Spd Flag' : 'str',
'Visibility (km)' : 'float64',
'Visibility Flag' : 'str',
'Stn Press (kPa)' : 'float64',
'Stn Press Flag' : 'str',
'Hmdx' : 'float64',
'Hmdx Flag' : 'str',
'Wind Chill' : 'float64',
'Wind Chill Flag' : 'str',
'Weather' : 'str'}
names = list(dtype.keys())
parse_dates = ['Date/Time']
filename = filename_prefix + '{0}_{1:04d}_{2:02d}'.format(station_id, year, month) + filename_suffix
dat = pd.read_csv(os.path.join(path_dat, filename), skiprows = 16, dtype = dtype, names = names, parse_dates = parse_dates)
return dat[cols]
#read_weather_csv(path_dat, 2017, 1, station_id).head()
#read_weather_csv(path_dat, 2015, 1, station_id).head()
## read each file into a list:
dat_weather_raw_list = [read_weather_csv(path_dat, i, j, station_id) \
for i in year_list \
for j in month_list]
# for i in year_list:
# for j in month_list:
# tmp = read_weather_csv(path_dat, i, j, station_id)
## combine list into single dataframe:
dat_weather_raw = pd.concat(dat_weather_raw_list, axis = 0)
print(dat_weather_raw.info())
dat_weather_raw.head()
## ========================================================================= ##
## data preparation
## ========================================================================= ##
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## aggregate trip data
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## set time index for dataframe (in order to use `resample`):
dat_trip_raw.set_index(
pd.DatetimeIndex(dat_trip_raw['start_date']),
inplace = True) ## [[?]] use inplace = True?
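## regarding the question above: the non-inplace equivalent is a plain
## reassignment, which makes the modification of dat_trip_raw explicit:
# dat_trip_raw = dat_trip_raw.set_index(pd.DatetimeIndex(dat_trip_raw['start_date']))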
## daily summary of trips:
dat_trip_day = pd.DataFrame()
dat_trip_day['trip_cnt'] = dat_trip_raw['start_date'].resample('24h').count()
dat_trip_day['duration_min_mean'] = dat_trip_raw['duration_sec'].resample('24h').mean() / 60
dat_trip_day['start_date'] = dat_trip_day.index
print(dat_trip_day.shape)
dat_trip_day.head()
## hourly summary of trips:
dat_trip_hr = pd.DataFrame()
dat_trip_hr['trip_cnt'] = dat_trip_raw['start_date'].resample('1h').count()
dat_trip_hr['duration_min_mean'] = dat_trip_raw['duration_sec'].resample('1h').mean() / 60
dat_trip_hr['start_date'] = dat_trip_hr.index
print(dat_trip_hr.shape)
dat_trip_hr.head()
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## join trip data to weather data
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
dat_hr_all = pd.merge(
left = dat_trip_hr,
right = dat_weather_raw,
how = 'left',
left_on = 'start_date',
right_on = 'Date/Time')
#dat_hr_all.head()
#dat_hr_all.info()
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## add further columns
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## add hour of day:
dat_hr_all['hr_of_day'] = dat_hr_all['start_date'].dt.hour
## add day of week:
dat_hr_all['day_of_week'] = dat_hr_all['start_date'].dt.weekday ## alternatives: .dt.dayofweek or .dt.weekday_name
## add trip indicator:
dat_hr_all['trip_ind'] = (dat_hr_all['trip_cnt'] > 0).astype(int)
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## further data prep
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
#dat_hr_all.info()
## drop wind chill, as it has too many missing values (only 6713 of 30360 observations are non-missing):
del dat_hr_all['Wind Chill']
dat_hr_all.head()
## ========================================================================= ##
## exploratory analysis of stations
## ========================================================================= ##
weatherstation_latlon = [[45.5047416666667, -73.5791666666667],
[45.4705555555556, -73.7408333333333],
[45.4677777777778, -73.7416666666667],
[45.5, -73.85],
[45.5175, -73.4169444444445],
[45.5166666666667, -73.4166666666667],
[45.3833333333333, -73.4333333333333],
[45.7, -73.5],
[45.4272222222222, -73.9291666666667]]
weatherstation_name = ['MCTAVISH',
'MONTREAL INTL A',
'MONTREAL/PIERRE ELLIOTT TRUDEAU INTL',
'STE GENEVIEVE',
'MONTREAL/ST-HUBERT',
'MONTREAL/ST-HUBERT A',
'LAPRAIRIE',
'RIVIERE DES PRAIRIES',
'STE-ANNE-DE-BELLEVUE 1']
loc = list(dat_stations_raw[['latitude', 'longitude']].mean())
## basic map:
folium_map = folium.Map(location = loc, zoom_start = 11)
## add markers for bike stations:
for i in range(0, len(dat_stations_raw)):
folium.CircleMarker(location = list(dat_stations_raw[['latitude', 'longitude']].iloc[i]),
radius = 2, color = "red")\
.add_to(folium_map)
## add markers for possible weather stations:
for i in range(0, len(weatherstation_name)):
folium.Marker(location = weatherstation_latlon[i],
popup = weatherstation_name[i])\
.add_to(folium_map)
## save plot as html:
folium_map.save("map-of-bike-and-possible-weather-stations.html")
## display map in browser (open new tab!)
import webbrowser, os
webbrowser.open('file://' + os.path.realpath("map-of-bike-and-possible-weather-stations.html"))
## ========================================================================= ##
## exploratory analysis of trips
## ========================================================================= ##
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## generic data exploration
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
%matplotlib inline
dat_hr_all.hist(bins = 20, figsize = (18, 9))
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## most common starting stations
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## most common starting stations:
start_station_counter = collections.Counter(dat_trip_raw['start_station_code'])
start_station_counter.most_common(10)
## make pandas data frame for visualizing with ggplot/plotnine:
dat_start_station_freq = pd.DataFrame(
start_station_counter.most_common(20),
columns = ['start_station_code', 'frequency'])
dat_start_station_freq.rename(index = dat_start_station_freq['start_station_code'], inplace = True)
## frequency series (for sorting):
## (pandas series with index that corresponds to categories):
dat_start_station_freq['frequency']
## create list for sorting:
#station_list = dat_start_station_freq['start_station_code'].value_counts().index.tolist()
station_list = dat_start_station_freq['frequency'].index.tolist()
station_cat = CategoricalDtype(categories=station_list, ordered=True)
dat_start_station_freq['start_station_code_cat'] = \
dat_start_station_freq['start_station_code'].astype(str).astype(station_cat)
## plot counter data (frequency table, with identity relation):
## (sorting does not work here)
ggplot(dat_start_station_freq, aes(x = 'start_station_code_cat', y = 'frequency')) + \
geom_bar(stat = 'identity') + \
coord_flip()
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## total number of trips
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## total number of trips:
dat_trip_raw.shape[0] # 14,598,961 (about 14.6 million)
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## number of trips
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## histogram of number of trips per hour:
ggplot(dat_trip_hr, aes(x = 'trip_cnt')) + geom_histogram(bins = 100)
print(ggplot(dat_trip_hr[dat_trip_hr['trip_cnt'] > 0],
aes(x = 'trip_cnt')) + geom_histogram(bins = 100))
dat_trip_hr[['trip_cnt']].describe()
## histogram of number of trips per day:
ggplot(dat_trip_day, aes(x = 'trip_cnt')) + geom_histogram(bins = 100)
print(ggplot(dat_trip_day[dat_trip_day['trip_cnt'] > 0],
aes(x = 'trip_cnt')) + geom_histogram(bins = 100))
dat_trip_day[['trip_cnt']].describe()
## define window for rolling mean and calculate:
window = 14*24
dat_trip_hr["trip_cnt_rollmean"] = dat_trip_hr[["trip_cnt"]]\
.rolling(window = window, center = False).\
mean()
%matplotlib inline
## line plot of number of trips per hour:
p = ggplot(dat_trip_hr, aes(y = 'trip_cnt', x = 'start_date')) + \
geom_point(alpha = .05) + \
geom_smooth(method = 'mavg', method_args = {'window' : window},
color = 'red', se = False) + \
labs(
title = 'Number of bike trips from 2014 to 2017',
x = 'Date',
y = 'Number of trips per hour'
) + \
scale_x_date(date_labels = "%b\n%Y")
#geom_line(aes(y = 'trip_cnt_rollmean'), color = "lightgreen") + \
print(p)
## select subset of data and drop missing values:
dat_tmp = dat_trip_hr[["trip_cnt", "trip_cnt_rollmean"]].dropna()
## restrict this subset to the warmer months (May through October):
summer_idx = (dat_trip_hr['start_date'].dt.month >= 5) & (dat_trip_hr['start_date'].dt.month <= 10)
dat_summer = dat_tmp[summer_idx]
dat_summer = dat_summer[["trip_cnt", "trip_cnt_rollmean"]].dropna()
# np.sum(dat_summer['trip_cnt'] == 0)
## correlation and explained variance of rolling mean and actual trip count:
rollmean_r = dat_trip_hr[["trip_cnt", "trip_cnt_rollmean"]].corr()
rollmean_r2 = rollmean_r ** 2
rollmean_r2
rollmean_r = dat_tmp.corr()
rollmean_r2 = rollmean_r ** 2
rollmean_r2
print("r^2 for moving average, all data = ", r2_score(dat_tmp[["trip_cnt"]], dat_tmp[["trip_cnt_rollmean"]]))
print("r^2 for moving average, summer data = ", r2_score(dat_summer[["trip_cnt"]], dat_summer[["trip_cnt_rollmean"]]))
## mean absolute error:
print("MAE for moving average, all data = ", mean_absolute_error(dat_tmp[["trip_cnt"]], dat_tmp[["trip_cnt_rollmean"]]))
print("MAE for moving average, summer data = ", mean_absolute_error(dat_summer[["trip_cnt"]], dat_summer[["trip_cnt_rollmean"]]))
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## weather data
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## line plot of number of some weather metrics
## select which metrics to plot (or not plot):
wch_cols = sorted(set(dat_weather_raw.columns) - set(['Date/Time', 'Year', 'Month'])) ## sorted for a deterministic plot order
for i in wch_cols[:3]:
p = ggplot(dat_weather_raw, aes(y = i, x = 'Date/Time')) + \
geom_point(alpha = .05) + \
geom_smooth(method = 'mavg', method_args = {'window' : 14*24},
color = 'red', se = False)
print(p)
for i in wch_cols[3:6]:
p = ggplot(dat_weather_raw, aes(y = i, x = 'Date/Time')) + \
geom_point(alpha = .05) + \
geom_smooth(method = 'mavg', method_args = {'window' : 14*24},
color = 'red', se = False)
print(p)
for i in wch_cols[6:]:
p = ggplot(dat_weather_raw, aes(y = i, x = 'Date/Time')) + \
geom_point(alpha = .05) + \
geom_smooth(method = 'mavg', method_args = {'window' : 14*24},
color = 'red', se = False)
print(p)
dat_weather_raw.describe()
dat_weather_raw.info()
dat_weather_raw.head()
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## multivariate data plots
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## exploring trip_cnt relationships:
%matplotlib inline
ggplot(dat_hr_all, aes(y = 'trip_cnt', x = 'Temp (°C)', color = 'hr_of_day')) + \
geom_point(alpha = .1)
ggplot(dat_hr_all, aes(y = 'trip_cnt', x = 'Wind Spd (km/h)', color = 'hr_of_day')) + \
geom_point(alpha = .1)
ggplot(dat_hr_all, aes(y = 'trip_cnt', x = 'Dew Point Temp (°C)', color = 'hr_of_day')) + \
geom_point(alpha = .1)
ggplot(dat_hr_all, aes(y = 'trip_cnt', x = 'Rel Hum (%)', color = 'hr_of_day')) + \
geom_point(alpha = .1)
ggplot(dat_hr_all, aes(y = 'trip_cnt', x = 'Wind Dir (10s deg)', color = 'hr_of_day')) + \
geom_point(alpha = .1)
ggplot(dat_hr_all, aes(y = 'trip_cnt', x = 'Stn Press (kPa)', color = 'hr_of_day')) + \
geom_point(alpha = .1)
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
## check correlations
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
#dat_hr_all.columns
varnames_cor = ['trip_cnt', 'Year', 'Month', 'Temp (°C)',
'Dew Point Temp (°C)', 'Rel Hum (%)',
'Wind Dir (10s deg)', 'Wind Spd (km/h)',
'Stn Press (kPa)', 'hr_of_day', 'day_of_week']
dat_cor = dat_hr_all[varnames_cor]
## correlation:
dat_cor.corr()
cormat = dat_cor.corr()
## correlation heatmap:
## simple:
colormap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(cormat, cmap = colormap)
varnames_heatmap_long = [varnames_orig_long_dict[i] for i in cormat.columns]
## more complex:
%matplotlib inline
#%matplotlib osx
fig, ax = plt.subplots(figsize = (10, 10))
#Generate Color Map, red & blue
colormap = sns.diverging_palette(220, 10, as_cmap = True)
#Generate Heat Map, allow annotations and place floats in map
sns.heatmap(cormat, cmap = colormap, annot = True, fmt = ".2f", center = 0)
## Apply axes tickmarks of more explicit variable names:
plt.xticks(np.arange(0, len(cormat.columns)) + .5, varnames_heatmap_long)
plt.yticks(np.arange(0, len(cormat.columns)) + .5, varnames_heatmap_long)
plt.show()
## ------------------------------------------------------------------------- ##
## define features and formula
## ------------------------------------------------------------------------- ##
## convert categorical variables to strings
## (in order for patsy to automatically dummy-code them without
## having to use the C() function):
# dat_hr_all['Month'] = dat_hr_all['Month'].astype('str')
# dat_hr_all['hr_of_day'] = dat_hr_all['hr_of_day'].astype('str')
## interesting:
## model accuracy seems to be higher when these features are kept numeric (non-categorical)!
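## as an alternative to converting the columns to strings, patsy's C() function
## can mark individual terms as categorical directly in the formula; a minimal
## sketch (formula_cat_txt is an illustrative name and is not used below):
formula_cat_txt = "Q('trip_cnt') ~ C(Q('Month')) + C(Q('hr_of_day')) + Q('Temp (°C)') - 1"
# patsy.dmatrices(formula_cat_txt, dat_hr_all, NA_action = 'drop', return_type = 'dataframe')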
## define target and features:
target = 'trip_cnt'
features = ['Month',
'Temp (°C)',
# 'Dew Point Temp (°C)', ## -- exclude, because highly correlated with Temp
'Rel Hum (%)',
'Wind Dir (10s deg)',
'Wind Spd (km/h)',
'Stn Press (kPa)',
'hr_of_day',
'day_of_week']
list(dat_hr_all)
## add patsy-quoting to features (for weird column names):
target = 'Q(\'' + target + '\')'
features = ['Q(\'' + i + '\')' for i in features]
## formula as text for patsy: without interactions
formula_txt = target + ' ~ ' + \
' + '.join(features) + ' - 1'
formula_txt
# ## try all twofold interactions, in order to
# ## find important ones via variable importance plots:
# formula_txt = target + ' ~ (' + ' + '.join(features) + ') ** 2 - 1'
# formula_txt
## create design matrices using patsy (could directly be used for modeling):
#patsy.dmatrix?
dat_y, dat_x = patsy.dmatrices(formula_txt, dat_hr_all,
NA_action = 'drop',
return_type = 'dataframe')
dat_x.head()
## other possibilities for dummy coding:
## * pd.get_dummies [[?]] which to use?
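## a minimal sketch of the pd.get_dummies() alternative (result not used below);
## it creates one indicator column per category, similar to what patsy's C() would produce:
dat_dummies_example = pd.get_dummies(
    dat_hr_all[['Month', 'day_of_week']].astype('category'),
    prefix = ['Month', 'day_of_week'])
dat_dummies_example.head()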
## ------------------------------------------------------------------------- ##
## train / test split
## ------------------------------------------------------------------------- ##
## Split the data into training/testing sets (using patsy/dmatrices):
dat_train_x, dat_test_x, dat_train_y, dat_test_y = train_test_split(
dat_x, dat_y, test_size = 0.1, random_state = 142)
## convert y's to Series (to match data types between patsy and non-patsy data prep:)
dat_train_y = dat_train_y[target]
dat_test_y = dat_test_y[target]
#dat_test_x.shape
## ------------------------------------------------------------------------- ##
## estimate model and evaluate fit and model assumptions
## ------------------------------------------------------------------------- ##
## Instantiate gradient boosting estimator:
mod_gb = GradientBoostingRegressor(n_estimators = 100,
random_state = 42,
loss = 'ls',
learning_rate = 0.1,
max_depth = 20,
min_samples_split = 70,
min_samples_leaf = 30,
verbose = 0)
## ------------------------------------------------------------------------- ##
## Randomized Search Cross-validation
## ------------------------------------------------------------------------- ##
## [[here]] [[todo]]
## * different distributions to sample from? (floating-point values, log scale?)
## (more research needed here; see the sketch after the parameter definitions below)
# specify parameters and distributions to sample from:
param_distributions = {
"n_estimators" : stats.randint(50, 201),
"learning_rate" : [0.2, 0.1, 0.05], # stats.uniform(0.05, 0.2 - 0.05),
"max_depth" : stats.randint(4, 21),
#"min_samples_split" : stats.randint(40, 101),
"min_samples_leaf" : stats.randint(30, 61)
}
#stats.randint(1, 4).rvs(20)
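## regarding the open question above: scipy's reciprocal (log-uniform) distribution
## is one way to sample the learning rate on a log scale; a sketch only, not used
## for the search below:
param_distributions_log = dict(param_distributions,
                               learning_rate = stats.reciprocal(0.01, 0.3))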
n_iter = 40
mod_randsearch = RandomizedSearchCV(
estimator = mod_gb,
param_distributions = param_distributions,
n_iter = n_iter,
scoring = "r2", ## "roc_auc", # "neg_mean_squared_error", "neg_mean_absolute_error"
cv = 4, ## 4-fold cross-validation
verbose = 2,
random_state = 7,
n_jobs = -1)
mod_randsearch.fit(dat_train_x, dat_train_y)
## best parameters and score in CV:
print(mod_randsearch.best_params_)
print(mod_randsearch.best_score_)
## get best model (estimator):
mod_gb = mod_randsearch.best_estimator_
## ------------------------------------------------------------------------- ##
## use and inspect model
## ------------------------------------------------------------------------- ##
## Make predictions using the testing set
dat_test_pred = mod_gb.predict(dat_test_x)
dat_train_pred = mod_gb.predict(dat_train_x)
## Inspect model:
print("MSE in training set = ", mean_squared_error(dat_train_y, dat_train_pred))
print("MSE in test set = ", mean_squared_error(dat_test_y, dat_test_pred))
print("MAE in training set = ", mean_absolute_error(dat_train_y, dat_train_pred))
print("MAE in test set = ", mean_absolute_error(dat_test_y, dat_test_pred))
print("R^2 (r squared) in training set = ", r2_score(dat_train_y, dat_train_pred))
print("R^2 (r squared) in test set = ", r2_score(dat_test_y, dat_test_pred))
## ------------------------------------------------------------------------- ##
## save model to disk
## ------------------------------------------------------------------------- ##
## [[?]] how to persist models?
## * pickle / joblib have security and maintainability limitations
##   see https://pyvideo.org/pycon-us-2014/pickles-are-for-delis-not-software.html or
##   http://scikit-learn.org/stable/modules/model_persistence.html
##   (3.4.2. Security & maintainability limitations)
from sklearn.externals import joblib ## note: newer scikit-learn versions use `import joblib` directly
# filename_model = 'model_gradient_boosting_interactions.pkl'
filename_model = 'model_gradient_boosting.pkl'
## don't actually save it here, only save it in original analysis (.py files)
#joblib.dump(mod_gb, os.path.join(path_out, filename_model))
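## following the maintainability note above, one option is to record the library
## versions next to the model file; a sketch ('model_gradient_boosting.json' is
## just an illustrative companion file name):
import sys
import json
import sklearn
model_metadata = {
    'python_version' : sys.version,
    'sklearn_version' : sklearn.__version__,
    'filename_model' : filename_model,
    'timestamp' : str(pd.Timestamp.today())
}
## (like the model itself, only written to disk in the original analysis files:)
# with open(os.path.join(path_out, 'model_gradient_boosting.json'), 'w') as f:
#     json.dump(model_metadata, f, indent = 2)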
## ========================================================================= ##
## load model from disk
## ========================================================================= ##
filename_model = 'model_gradient_boosting.pkl'
filename_out_prefix = 'mod_gb_'
n_jobs = -2
comment_str = ""
## load model:
mod_this = joblib.load(os.path.join(path_out, filename_model))
## define number of grid points for pdp interaction plots:
num_grid_points_int = [20, 20]
num_grid_points_main = 40
dat_perf_metrics = {
'timestamp' : pd.Timestamp.today(),
'timestamp_str' : str(pd.Timestamp.today()),
'filename_model' : filename_model,
'filename_out_prefix' : filename_out_prefix
}
pd.DataFrame.from_dict(dat_perf_metrics, orient = "index")
## ========================================================================= ##
## make predictions and get model performance
## ========================================================================= ##
## [[here]]
## * select only summer data (month >= 5 and <= 9)
dat_test_summer_x = dat_test_x[(dat_test_x["Q('Month')"] >= 5) & (dat_test_x["Q('Month')"] <= 9)]
dat_test_summer_y = dat_test_y[(dat_test_x["Q('Month')"] >= 5) & (dat_test_x["Q('Month')"] <= 9)]
## Make predictions using the testing set
dat_train_pred = mod_this.predict(dat_train_x)
dat_test_pred = mod_this.predict(dat_test_x)
dat_test_summer_pred = mod_this.predict(dat_test_summer_x)
## Inspect model:
this_perf_metrics = {
'mse_train' : mean_squared_error(dat_train_y, dat_train_pred), # MSE in training set
'mse_test' : mean_squared_error(dat_test_y, dat_test_pred), # MSE in test set
'mae_train' : mean_absolute_error(dat_train_y, dat_train_pred), # MAE in training set
'mae_test' : mean_absolute_error(dat_test_y, dat_test_pred), # MAE in test set
'mae_test_summer' : mean_absolute_error(dat_test_summer_y, dat_test_summer_pred),
'r2_train' : r2_score(dat_train_y, dat_train_pred), # R^2 (r squared) in training set
'r2_test' : r2_score(dat_test_y, dat_test_pred), # R^2 (r squared) in test set
'r2_test_summer' : r2_score(dat_test_summer_y, dat_test_summer_pred)
}
#pd.DataFrame(this_perf_metrics, index = [0])
dat_perf_metrics.update(this_perf_metrics)
pd.DataFrame.from_dict(dat_perf_metrics, orient = "index")
## r2 for non-zero bike trips counts:
this_perf_metrics = {
'r2_train_nonzero' : r2_score(dat_train_y[dat_train_y > 0], dat_train_pred[dat_train_y > 0]),
'r2_test_nonzero' : r2_score(dat_test_y[dat_test_y > 0], dat_test_pred[dat_test_y > 0])
}
#pd.DataFrame(this_perf_metrics, index = [0])
dat_perf_metrics.update(this_perf_metrics)
pd.DataFrame.from_dict(dat_perf_metrics, orient = "index")
## ========================================================================= ##
## variable importance
## ========================================================================= ##
## variable importance:
var_imp = pd.DataFrame(
{'varname_q' : dat_train_x.columns, ## quoted
'varname_orig' : [i[3:-2] for i in dat_train_x.columns],
'importance' : list(mod_this.feature_importances_)})
dat_varnames_long = pd.DataFrame.from_dict(varnames_long_dict, orient = 'index', columns = ['varname'])
var_imp = pd.merge(var_imp, dat_varnames_long,
left_on = 'varname_q', right_index = True,
how = 'left')
## for missing "varnames" (not defined in dat_varnames_long, e.g., interactions),
## use varname_q instead:
var_imp['varname'] = np.where(pd.isnull(var_imp['varname']),
var_imp['varname_q'], var_imp['varname'])
var_imp.sort_values('importance', ascending = False, inplace = True)
var_imp.head(n = 15)
#print(var_imp[['varname', 'importance']].head(n = 15))
## sort variables by importance for plotting:
varname_list = list(var_imp.sort_values('importance')['varname'])
varname_cat = CategoricalDtype(categories = varname_list, ordered=True)
var_imp['varname_cat'] = \
var_imp['varname'].astype(str).astype(varname_cat)
## plot variable importance (15 most important):
p = ggplot(var_imp[:15], aes(y = 'importance', x = 'varname_cat')) + \
geom_bar(stat = 'identity') + \
labs(
title = "Feature importance",
x = "Feature",
y = "Importance") + \
coord_flip()
print(p)
## ========================================================================= ##
## plot data with predictions
## ========================================================================= ##
## make predictions for complete dataset:
dat_y['pred'] = mod_this.predict(dat_x)
dat_y.head()
## add to original dataset:
dat_hr_all = pd.merge(dat_hr_all,
dat_y[['pred']],
how = 'left',
left_index = True,
right_index = True)
## plot predictions vs. real value of target:
p = ggplot(dat_y, aes(x = "Q('trip_cnt')", y = 'pred')) + \
geom_point(alpha = .1) + \
geom_abline(intercept = 0, slope = 1,
color = "red",
alpha = .5,
linetype = "dashed",
size = 1.2) + \
labs(
title = "Predictions vs. true values",
x = "True value of hourly trip count",
y = "Prediction for hourly trip count")
print(p)
p = ggplot(dat_hr_all, aes(y = 'trip_cnt', x = 'start_date')) + \
geom_point(alpha = .05, color = 'black') + \
geom_point(aes(y = 'pred'), alpha = .05, color = 'orange') + \
geom_smooth(method = 'mavg', method_args = {'window' : 14*24},
color = 'red', se = False)
print(p)
## add comment to model or model run:
this_perf_metrics = {
'comment' : comment_str
}
dat_perf_metrics.update(this_perf_metrics)
#pd.DataFrame(dat_perf_metrics, index = [0])
pd.DataFrame.from_dict(dat_perf_metrics, orient = "index")
filename_model_runs = "model_runs.csv"
## check if file exists:
if os.path.isfile(os.path.join(path_out, filename_model_runs)):
dat_model_runs = pd.read_csv(os.path.join(path_out, filename_model_runs))
else:
dat_model_runs = pd.DataFrame()
## append current runs data:
dat_model_runs = pd.concat([dat_model_runs,
pd.DataFrame(dat_perf_metrics, index = [0])],
sort = True)
## sort columns by order of dict (others at the end):
colnames_dict = list(dat_perf_metrics.keys())
colnames_notindict = list(set(dat_model_runs.columns) - set(colnames_dict))
colnames_ordered = [colnames_dict, colnames_notindict]
colnames_ordered = list(chain(*colnames_ordered)) ## flatten
## reorder columns:
dat_model_runs = dat_model_runs[colnames_ordered]
dat_model_runs
## ========================================================================= ##
## partial dependence plots: main effects
## ========================================================================= ##
from pdpbox import pdp, get_dataset, info_plots
# Package scikit-learn (PDP via function plot_partial_dependence() )
# http://scikit-learn.org/stable/auto_examples/ensemble/plot_partial_dependence.html
# Package PDPbox (ICE, c-ICE for single and multiple predictors)
# https://github.com/SauceCat/PDPbox
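## the scikit-learn alternative mentioned above, as a minimal sketch; the import
## path depends on the scikit-learn version (sklearn.inspection exists from
## version 0.22 on, older releases provide sklearn.ensemble.partial_dependence),
## hence only sketched here:
# from sklearn.inspection import plot_partial_dependence
# plot_partial_dependence(mod_this, dat_train_x,
#                         features = [list(dat_train_x.columns).index("Q('Temp (°C)')")])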
#pd.merge(dat_train_x, pd.DataFrame(dat_train_y), left_index = True, right_index = True)
#dat_train_x.join(dat_train_y) ## identical
#dat_train_x.columns
%matplotlib inline
plot_params_default = {
# plot title and subtitle
'title': '',
'subtitle': '',
'title_fontsize': 20,
'subtitle_fontsize': 12,
'font_family': 'Arial',
# matplotlib color map for ICE lines
'line_cmap': 'Blues',
'xticks_rotation': 0,
# pdp line color, highlight color and line width
'pdp_color': '#1A4E5D',
'pdp_hl_color': '#FEDC00',
'pdp_linewidth': 1.5,
# horizontal zero line color and width
'zero_color': '#E75438',
'zero_linewidth': 1,
# pdp std fill color and alpha
'fill_color': '#66C2D7',
'fill_alpha': 0.2,
# marker size for pdp line
'markersize': 3.5,
}
plot_ylim_max = 2000 ## the same for all plots, for comparability.
# plot_params_default.update({
# 'title': 'Partial Dependence for: %s' % \
# varnames_long_dict[wch_feature]
# })
def construct_pdp(model, feature,
dataset_x = dat_train_x, dataset_y = dat_train_y,
num_grid_points = num_grid_points_main, n_jobs = n_jobs,
model_features = dat_train_x.columns):
## calculation for pdp (and then ice plot) for numeric feature:
pdp_current = pdp.pdp_isolate(
model = model, dataset = dataset_x.join(dataset_y),
num_grid_points = num_grid_points, n_jobs = n_jobs, ## needs to be 1 for XGBoost model!
model_features = model_features,
feature = feature)
## construct centered pdp plot for numeric features:
fig_center, axes_center = pdp.pdp_plot(
pdp_current, varnames_long_dict[feature],
center = True,
plot_params = plot_params_default
)
axes_center["pdp_ax"].set_ylabel("Number of bike rides per hour")
axes_center["pdp_ax"].set_title('Partial Dependence Plot for: %s' % \
varnames_long_dict[feature], y = 1)
## construct non-centered pdp plot for numeric features:
fig, axes = pdp.pdp_plot(
pdp_current, varnames_long_dict[feature],
center = False,
plot_params = plot_params_default
)
axes["pdp_ax"].set_ylabel("Number of bike rides per hour")
#axes["pdp_ax"].set_ylim(0, plot_ylim_max)
#axes["pdp_ax"].set_ylim(0, np.max(vars(pdp_current)['count_data']['count']))
axes["pdp_ax"].set_title('Partial Dependence Plot for: %s' % \
varnames_long_dict[feature], y = 1)
#axes["pdp_ax"].margins(0)
return pdp_current, fig_center, fig
def construct_ice_plot(pdp_current, feature):
## centered ice-plot for numeric feature:
fig_center, axes_center = pdp.pdp_plot(
pdp_current, varnames_long_dict[feature],
center = True,
plot_lines = True, frac_to_plot = 100, ## percentage!
x_quantile = False, plot_pts_dist = True, show_percentile = True,
plot_params = plot_params_default)
axes_center["pdp_ax"]["_pdp_ax"].set_ylabel("Number of bike rides per hour")
axes_center["pdp_ax"]["_pdp_ax"].set_title('Partial Dependence and ICE Plot for: %s' % \
varnames_long_dict[feature], y = 1.1)
## standard ice-plot for numeric feature:
fig, axes = pdp.pdp_plot(
pdp_current, varnames_long_dict[feature],
center = False,
plot_lines = True, frac_to_plot = 100, ## percentage!
x_quantile = False, plot_pts_dist = True, show_percentile = True,
plot_params = plot_params_default)
axes["pdp_ax"]["_pdp_ax"].set_ylabel("Number of bike rides per hour")
#axes["pdp_ax"]["_pdp_ax"].set_ylim(0, np.max(vars(pdp_current)['count_data']['count']))
axes["pdp_ax"]["_pdp_ax"].set_title('Partial Dependence and ICE Plot for: %s' % \
varnames_long_dict[feature], y = 1.1)
return fig_center, fig
def save_pdp_or_ice_plot(fig, feature, filename_stump):
filename_this = filename_out_prefix + filename_stump + \
pv.sanitize_python_var_name(feature) + ".jpg"
print("Saving ", filename_this)
fig.savefig(fname = os.path.join(path_out, filename_this),
dpi = 150, pad_inches = 0.025, bbox_inches = "tight")
return
## define features to plot:
#features
pdp_plot_features = ["Q('Temp (°C)')", "Q('Stn Press (kPa)')",
"Q('hr_of_day')", "Q('Rel Hum (%)')",
"Q('day_of_week')"]
## make pdp and ICE plots for the selected features (see the save sketch below):
for wch_feature in pdp_plot_features:
pdp_current, fig_center, fig = construct_pdp(model = mod_this, feature = wch_feature)
fig_center
fig
fig_center, fig = construct_ice_plot(pdp_current, feature = wch_feature)
fig_center
fig
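## to actually write the figures to disk, the helper defined above can be called
## inside the loop; a sketch (the filename stumps are illustrative choices):
# save_pdp_or_ice_plot(fig, wch_feature, "pdp_")
# save_pdp_or_ice_plot(fig_center, wch_feature, "pdp_centered_")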
## formula as text for patsy: without interactions
## try all twofold interactions, in order to
## find important ones via variable importance plots:
formula_interact_txt = target + ' ~ (' + ' + '.join(features) + ') ** 2 - 1'
formula_interact_txt
## create design matrices using patsy (could directly be used for modeling):
#patsy.dmatrix?
dat_interact_y, dat_interact_x = patsy.dmatrices(formula_interact_txt, dat_hr_all,
NA_action = 'drop',
return_type = 'dataframe')
## ------------------------------------------------------------------------- ##
## train / test split
## ------------------------------------------------------------------------- ##
## Split the data into training/testing sets (using patsy/dmatrices):
dat_interact_train_x, dat_interact_test_x, dat_interact_train_y, dat_interact_test_y = train_test_split(
dat_interact_x, dat_interact_y, test_size = 0.1, random_state = 142)
## convert y's to Series (to match data types between patsy and non-patsy data prep:)
dat_interact_train_y = dat_interact_train_y[target]
dat_interact_test_y = dat_interact_test_y[target]
## ------------------------------------------------------------------------- ##
## estimate model and evaluate fit and model assumptions
## ------------------------------------------------------------------------- ##
## Instantiate gradient boosting estimator (with interaction features):
mod_interact_gb = GradientBoostingRegressor(n_estimators = 100,
random_state = 42,
loss = 'ls',
learning_rate = 0.1,
max_depth = 20,
min_samples_split = 70,
min_samples_leaf = 30,
verbose = 1)
## Train the model using the training sets:
mod_interact_gb.fit(dat_interact_train_x, dat_interact_train_y)
## ------------------------------------------------------------------------- ##
## use and inspect model
## ------------------------------------------------------------------------- ##
## Make predictions using the testing set
dat_interact_test_pred = mod_interact_gb.predict(dat_interact_test_x)
dat_interact_train_pred = mod_interact_gb.predict(dat_interact_train_x)
## Inspect model:
print("MSE in training set = ", mean_squared_error(dat_interact_train_y, dat_interact_train_pred))
print("MSE in test set = ", mean_squared_error(dat_interact_test_y, dat_interact_test_pred))
print("MAE in training set = ", mean_absolute_error(dat_interact_train_y, dat_interact_train_pred))
print("MAE in test set = ", mean_absolute_error(dat_interact_test_y, dat_interact_test_pred))
print("R^2 (r squared) in training set = ", r2_score(dat_interact_train_y, dat_interact_train_pred))
print("R^2 (r squared) in test set = ", r2_score(dat_interact_test_y, dat_interact_test_pred))
## ========================================================================= ##
## variable importance (interactions)
## ========================================================================= ##
## variable importance:
var_interact_imp = pd.DataFrame(
{'varname_q' : dat_interact_train_x.columns, ## quoted
'varname_orig' : [i[3:-2] for i in dat_interact_train_x.columns],
'importance' : list(mod_interact_gb.feature_importances_)})
dat_interact_varnames_long = pd.DataFrame.from_dict(varnames_long_dict, orient = 'index', columns = ['varname'])
var_interact_imp = pd.merge(var_interact_imp, dat_interact_varnames_long,
left_on = 'varname_q', right_index = True,
how = 'left')
## for missing "varnames" (not defined in dat_varnames_long, e.g., interactions),
## use varname_q instead:
var_interact_imp['varname'] = np.where(pd.isnull(var_interact_imp['varname']),
var_interact_imp['varname_q'], var_interact_imp['varname'])
var_interact_imp.sort_values('importance', ascending = False, inplace = True)
var_interact_imp.head(n = 15)
#print(var_imp[['varname', 'importance']].head(n = 15))
## sort variables by importance for plotting:
varname_interact_list = list(var_interact_imp.sort_values('importance')['varname'])
varname_interact_cat = CategoricalDtype(categories = varname_interact_list, ordered=True)
var_interact_imp['varname_cat'] = \
var_interact_imp['varname'].astype(str).astype(varname_interact_cat)
## plot variable importance (15 most important):
p = ggplot(var_interact_imp[:15], aes(y = 'importance', x = 'varname_cat')) + \
geom_bar(stat = 'identity') + \
labs(
title = "Feature importance",
x = "Feature",
y = "Importance") + \
coord_flip() + \
theme(axis_text = element_text(size = 12))
print(p)
## ========================================================================= ##
## partial dependence plots: interactions
## ========================================================================= ##
plot_params_pdp_int_default = {
# plot title and subtitle
'title': '',
'subtitle': '',
'title_fontsize': 15,
'subtitle_fontsize': 12,
# color for contour line
'contour_color': 'white',
'font_family': 'Arial',
# matplotlib color map for interact plot
'cmap': 'viridis',
# fill alpha for interact plot
'inter_fill_alpha': 0.8,
# fontsize for interact plot text
'inter_fontsize': 9,
}
def construct_pdp_interact(model, feature_names,
dataset_x = dat_train_x, dataset_y = dat_train_y,
num_grid_points = num_grid_points_int, n_jobs = n_jobs,
model_features = dat_train_x.columns):
inter_current = pdp.pdp_interact(
model = model, dataset = dataset_x.join(dataset_y),
num_grid_points = num_grid_points, n_jobs = n_jobs, ## needs to be 1 for XGBoost model!
model_features = model_features, features = feature_names)
fig, axes = pdp.pdp_interact_plot(
inter_current, feature_names = feature_names, x_quantile = False,
plot_type = 'contour', plot_pdp = False,
plot_params = plot_params_pdp_int_default)
axes["pdp_inter_ax"].set_xlabel(varnames_long_dict[feature_names[0]])
axes["pdp_inter_ax"].set_ylabel(varnames_long_dict[feature_names[1]])
## [[here]] y-labels!
axes["pdp_inter_ax"].set_title('Number of bike rides per hour\n(Partial Dependence Plot) for\n{0} and {1}\n'\
.format(varnames_long_dict[feature_names[0]],
varnames_long_dict[feature_names[1]]),
y = 1)
return fig
def save_pdp_int_plot(fig, features, filename_stump):
filename_this = filename_out_prefix + filename_stump + \
pv.sanitize_python_var_name(features[0]) + "--" + \
pv.sanitize_python_var_name(features[1]) + ".jpg"
print("Saving ", filename_this)
fig.savefig(fname = os.path.join(path_out, filename_this),
dpi = 150, pad_inches = .025, bbox_inches = "tight")
return
## define feature combinations to plot:
pdp_plot_int_feature_pairs = [
["Q('hr_of_day')", "Q('Temp (°C)')", ],
["Q('hr_of_day')", "Q('Stn Press (kPa)')"],
["Q('Month')", "Q('Stn Press (kPa)')"],
["Q('Rel Hum (%)')", "Q('Temp (°C)')"],
["Q('Stn Press (kPa)')", "Q('Temp (°C)')"],
["Q('Stn Press (kPa)')", "Q('day_of_week')"],
["Q('hr_of_day')", "Q('day_of_week')"]
]
## make and save pdp interaction plots:
for wch_features in pdp_plot_int_feature_pairs[:3]:
fig = construct_pdp_interact(model = mod_this,
#num_grid_points = [4, 4],
feature_names = wch_features)
fig
## make and save pdp interaction plots:
for wch_features in pdp_plot_int_feature_pairs[3:6]:
fig = construct_pdp_interact(model = mod_this,
#num_grid_points = [4, 4],
feature_names = wch_features)
fig
## make and save pdp interaction plots:
for wch_features in pdp_plot_int_feature_pairs[6:]:
fig = construct_pdp_interact(model = mod_this,
#num_grid_points = [4, 4],
feature_names = wch_features)
fig
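## as above, the save helper can be used to write the interaction plots to disk
## (the filename stump is an illustrative choice):
# save_pdp_int_plot(fig, wch_features, "pdp_interact_")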